library(e1071)
## Warning: package 'e1071' was built under R version 4.0.3
library(DMwR)
## Warning: package 'DMwR' was built under R version 4.0.3
## Loading required package: lattice
## Loading required package: grid
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(distr)
## Warning: package 'distr' was built under R version 4.0.3
## Loading required package: startupmsg
## Warning: package 'startupmsg' was built under R version 4.0.3
## Utilities for Start-Up Messages (version 0.9.6)
## For more information see ?"startupmsg", NEWS("startupmsg")
## Loading required package: sfsmisc
## Warning: package 'sfsmisc' was built under R version 4.0.3
## Object Oriented Implementation of Distributions (version 2.8.0)
## Attention: Arithmetics on distribution objects are understood as operations on corresponding random variables (r.v.s); see distrARITH().
## Some functions from package 'stats' are intentionally masked ---see distrMASK().
## Note that global options are controlled by distroptions() ---c.f. ?"distroptions".
## For more information see ?"distr", NEWS("distr"), as well as
## http://distr.r-forge.r-project.org/
## Package "distrDoc" provides a vignette to this package as well as to several extension packages; try vignette("distr").
##
## Attaching package: 'distr'
## The following objects are masked from 'package:stats':
##
## df, qqplot, sd
library(caret)
## Warning: package 'caret' was built under R version 4.0.3
## Loading required package: ggplot2
1(a)
1(b)
2(a)
# in summary, first hidden state has probability 0.619 when the second state is fair, and 0.153 when the second state is weighted. Last hidden state has probability 0.765 when the state prior is fair, and 0.265 when the state prior is weighted.
# in general, we can use an algorithm to calculate rest of the "in-between" probabilities of the hidden states:
# observed sequence of 20 die rolls
outcome <- c(4, 4, 5, 2, 2, 4, 6, 6, 1, 4, 1, 1, 3, 5, 5, 2, 5, 4, 2, 1)
# emission probabilities of the weighted die, one row per face
weight <- data.frame(dice = c(1, 2, 3, 4, 5, 6),
                     probability = c(2, 2, 1, 4, 2, 2) / 13)
# build function based on formula for finding pi = fair for inner states
#
# Computes P(state at `index_current` is fair | neighbouring states, observed
# roll) for an interior hidden state of the two-dice HMM.
#
# Args:
#   state_before:  0 if the previous hidden state is fair, 1 if weighted.
#   state_after:   0 if the next hidden state is fair, 1 if weighted.
#   index_current: position (2..19) of the current roll in `outcome`.
#
# Depends on the globals `outcome` (observed rolls) and `weight` (weighted-die
# emission table) defined above.
#
# Returns: probability that the current hidden state used the fair die.
find_p_current <- function(state_before, state_after, index_current) {
  # P(roll | weighted die), looked up once instead of in every branch
  p_roll_weighted <- weight[weight$dice == outcome[index_current], "probability"]
  if (state_before == 0 && state_after == 0) {
    # both neighbours fair: two stay-transitions (0.75 each) for "fair",
    # two switch-transitions (0.25 each) for "weighted"
    (0.75^2 / 6) / (0.75^2 / 6 + 0.25^2 * p_roll_weighted)
  } else if (state_before != state_after) {
    # mixed neighbours: the 0.75 * 0.25 transition factor is the same for
    # either value of the current state, so it cancels
    (1 / 6) / (1 / 6 + p_roll_weighted)
  } else {
    # both neighbours weighted: mirror image of the first case
    (0.25^2 / 6) / (0.25^2 / 6 + 0.75^2 * p_roll_weighted)
  }
}
# make a table: column 1 is the index of the current state from 2 to 19. For the rest of the columns, the header indicates the assumed states before and after the current state. For example, the "fair_fair" column assumes that the state before the current state is fair and the state after is fair.
# preallocate length-19 vectors instead of growing them inside the loop;
# slot 1 stays NA and is dropped when row 1 is removed below
fair_fair     <- rep(NA_real_, 19)
fair_weight   <- rep(NA_real_, 19)
weight_fair   <- rep(NA_real_, 19)
weight_weight <- rep(NA_real_, 19)
for (index_current in seq(2, 19, 1)) {
  fair_fair[index_current]     <- find_p_current(0, 0, index_current)
  fair_weight[index_current]   <- find_p_current(0, 1, index_current)
  weight_fair[index_current]   <- find_p_current(1, 0, index_current)
  weight_weight[index_current] <- find_p_current(1, 1, index_current)
}
p_inner_states <- data.frame(state_index = seq(1, 19, 1),
                             fair_fair, fair_weight,
                             weight_fair, weight_weight)[-1, ]
# table demonstrates the probabilities that the hidden states are fair given all the conditions
p_inner_states
## state_index fair_fair fair_weight weight_fair weight_weight
## 2 2 0.8297872 0.3513514 0.3513514 0.05676856
## 3 3 0.9069767 0.5200000 0.5200000 0.10743802
## 4 4 0.9069767 0.5200000 0.5200000 0.10743802
## 5 5 0.9069767 0.5200000 0.5200000 0.10743802
## 6 6 0.8297872 0.3513514 0.3513514 0.05676856
## 7 7 0.9069767 0.5200000 0.5200000 0.10743802
## 8 8 0.9069767 0.5200000 0.5200000 0.10743802
## 9 9 0.9069767 0.5200000 0.5200000 0.10743802
## 10 10 0.8297872 0.3513514 0.3513514 0.05676856
## 11 11 0.9069767 0.5200000 0.5200000 0.10743802
## 12 12 0.9069767 0.5200000 0.5200000 0.10743802
## 13 13 0.9512195 0.6842105 0.6842105 0.19402985
## 14 14 0.9069767 0.5200000 0.5200000 0.10743802
## 15 15 0.9069767 0.5200000 0.5200000 0.10743802
## 16 16 0.9069767 0.5200000 0.5200000 0.10743802
## 17 17 0.9069767 0.5200000 0.5200000 0.10743802
## 18 18 0.8297872 0.3513514 0.3513514 0.05676856
## 19 19 0.9069767 0.5200000 0.5200000 0.10743802
2(b)
# define a function that returns TRUE when a Uniform(0, 1) draw falls below p,
# i.e. one Bernoulli(p) sample; TRUE means "fair"
set.seed(12)
pdist <- function(p) {
  runif(1) < p
}
# initialize: start every hidden state at "fair" (TRUE)
initial <- rep(TRUE, 20)
# renamed from `pi`: assigning to `pi` masked base::pi and silently broke the
# exp(-10*pi) computation in section 4(d)
pi_state <- rep(NA, 20)
n_iter <- 10500
burn_in <- 500
# one column per sweep, preallocated instead of growing a data.frame
cumulative <- matrix(NA, nrow = 20, ncol = n_iter)
# gibbs sampling
# NOTE(review): each sweep conditions entirely on the PREVIOUS sweep's states
# (a Jacobi-style update), matching the original implementation — confirm this
# is the intended sampler rather than in-sweep (Gauss-Seidel) Gibbs.
for (i in seq_len(n_iter)) {
  # endpoint states use the probabilities derived in 2(a)
  pi_state[1]  <- if (initial[2])  pdist(0.619) else pdist(0.153)
  pi_state[20] <- if (initial[19]) pdist(0.765) else pdist(0.265)
  for (j in seq(2, 19, 1)) {
    row <- p_inner_states$state_index == j
    if (initial[j - 1] && initial[j + 1]) {
      pi_state[j] <- pdist(p_inner_states[row, "fair_fair"])
    } else if (initial[j - 1] != initial[j + 1]) {
      # fair_weight and weight_fair columns are identical, so one lookup covers
      # both mixed-neighbour cases
      pi_state[j] <- pdist(p_inner_states[row, "fair_weight"])
    } else {
      pi_state[j] <- pdist(p_inner_states[row, "weight_weight"])
    }
  }
  initial <- pi_state
  cumulative[, i] <- pi_state
}
# check the probability that each pi is fair
# drop the first `burn_in` sweeps, then average the remaining 10000
# (the original `cumulative[, -500]` removed only column 500, not the first
# 500 columns, while still dividing by 10000)
result <- rowSums(cumulative[, -seq_len(burn_in)]) / (n_iter - burn_in)
plot(result)
3(a)
# find test and training sets from Assign2
gradAdmit <- read.csv(here::here("gradAdmit.csv"))
set.seed(12345)
n <- nrow(gradAdmit)
# hold out 20% of the rows as the test set
# (index renamed from `sample`, which shadowed base::sample;
#  replace = FALSE spelled out instead of the reassignable F)
test_idx <- sample.int(n = n, size = floor(.2 * n), replace = FALSE)
train <- gradAdmit[-test_idx, ]
test <- gradAdmit[test_idx, ]
# check class balance
table(train$admit)
##
## 0 1
## 221 99
sum(train$admit)/nrow(train)
## [1] 0.309375
table(test$admit)
##
## 0 1
## 52 28
sum(test$admit)/nrow(test)
## [1] 0.35
# in both sets of data, the # of rejects is much higher than the # of admits. In the training set, 31% are admitted. In the test set, 35% students are admitted.
3(b)
# best model from Assign2 and its predictions on the test set
retrain <- svm(formula = factor(admit) ~ .,
               data = train,
               scale = TRUE,
               kernel = "radial",
               gamma = 0.2,
               cost = 8)
# predict.svm has no `type` argument; the original `type = "reponse"` (typo
# for "response") was silently swallowed by `...`, so it is dropped here —
# predictions are unchanged
repredict <- predict(retrain, test)
table(factor(test$admit), repredict) # rows are actual values, columns are predicted values
## repredict
## 0 1
## 0 48 4
## 1 23 5
# precision = 0.56, recall= 0.18, specificity= 0.92
3(c)
# to achieve a 1:1 ratio, we need 221-99 = 122 more samples, so that's 100% * 122/99 = 123% of the current minority sample.
train$admit <- factor(train$admit)   # SMOTE requires a factor response
# perc.over = 123 oversamples the minority class; perc.under = 0 keeps no
# majority cases in the SMOTE output (they are re-attached below)
# (assignment changed from `=` to `<-` for consistency with the rest of the file)
newData <- SMOTE(admit ~ ., train, perc.over = 123, perc.under = 0)
# combine with original training set
newTrain <- train[train$admit == 0, ]
newTrain <- rbind(newTrain, newData)
table(newTrain$admit)
##
## 0 1
## 221 198
# the new class balance is approx. 1:1.
3(d)
# retrain the model on the SMOTE-balanced training set
reretrain <- svm(formula = admit ~ .,
                 data = newTrain,
                 scale = TRUE,
                 kernel = "radial",
                 gamma = 0.2,
                 cost = 8)
# predict.svm has no `type` argument; the original `type = "reponse"` (typo
# for "response") was silently swallowed by `...`, so it is dropped here —
# predictions are unchanged
rerepredict <- predict(reretrain, test)
table(factor(test$admit), rerepredict)
## rerepredict
## 0 1
## 0 38 14
## 1 12 16
# new precision is 0.53, recall is 0.57, specificity is 0.73.
4(c) Because \(g^2(x)p(x)>0\) when \(x\geq10\pi\) and equals \(0\) otherwise, we want to find a \(p^*(x)\) that’s “larger” than \(p(x)\) when \(x\geq10\pi\) and equals \(0\) when \(x<10\pi\). We can find this function by setting \(p^*(x)\) to be \(p(x)\) shifted horizontally to the right (in other words, \(p^*(x)=e^{-(x-c)}\) for some positive constant \(c\)) and setting it to \(0\) for \(x<10\pi\). In practice, when we try to sample from it in R, the results are not as promising due to software restrictions, since the support starts at \(10\pi\). Therefore, we shift \(p^*(x)\) back to the left by \(10\pi\), giving the standard exponential density \(p^*(x)=e^{-x}\) for \(x\geq0\), and set it to \(0\) for \(x<0\).
4(d)
set.seed(1234)
# importance sampling: draw y ~ Exp(1) and use the shifted target x = y + 10*pi
draws <- rexp(10^6, 1)
# E[sin(y + 10*pi) * e^{-(y + 10*pi)} / e^{-y}] = E[sin(y)] * e^{-10*pi}
# (sin has period 2*pi, so sin(y + 10*pi) = sin(y))
# base::pi is used explicitly because section 2(b) assigned a global `pi`,
# masking the constant — the original run printed a length-20 vector here for
# exactly that reason. The scalar loop is also replaced by mean().
estimate <- mean(sin(draws)) * exp(-10 * base::pi)
estimate # estimate of the integral is 1.135857e-14, which is very close to the real value (1.135551e-14).
## [1] 1.135857e-14